{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee95bb0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install graphdatascience"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0342d05c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from graphdatascience import GraphDataScience\n",
    "import pandas as pd\n",
    "\n",
    "host = \"bolt://localhost:7687\"\n",
    "user = \"neo4j\"\n",
    "password= \"pleaseletmein\"\n",
    "\n",
    "gds = GraphDataScience(host, auth=(user, password))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7b0dde99",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Constraints\n",
    "gds.run_cypher(\"\"\"\n",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (a:Article) REQUIRE a.url IS UNIQUE;\n",
    "\"\"\")\n",
    "gds.run_cypher(\"\"\"\n",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (a:Author) REQUIRE a.name IS UNIQUE;\n",
    "\"\"\")\n",
    "gds.run_cypher(\"\"\"\n",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (t:Tag) REQUIRE t.name IS UNIQUE;\n",
    "\"\"\")\n",
    "gds.run_cypher(\"\"\"\n",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (l:List) REQUIRE l.id IS UNIQUE;\n",
    "\"\"\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1415312f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import articles\n",
    "gds.run_cypher(\"\"\"\n",
    "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/medium/medium_articles.csv\" AS row\n",
    "CALL {\n",
    "     WITH row\n",
    "     MERGE (a:Article {url: row.story})\n",
    "     SET a.title = row.title,\n",
    "         a.date = date(CASE WHEN row.date <> \"\" THEN row.date ELSE \"1990-01-01\" END)\n",
    "     MERGE (au:Author {name:coalesce(row.author, \"Unknown\")})\n",
    "     MERGE (au)-[:WROTE]->(a)\n",
    "     WITH a, apoc.convert.fromJsonList(row.tags) AS tags\n",
    "     UNWIND tags AS tag\n",
    "     MERGE (t:Tag {name:tag})\n",
    "     MERGE (a)-[:HAS_TAG]->(t)\n",
    "} IN TRANSACTIONS\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7358034c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import lists\n",
    "gds.run_cypher(\"\"\"\n",
    "LOAD CSV WITH HEADERS FROM \"https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/medium/medium_lists.csv\" AS row\n",
    "CALL {\n",
    "    WITH row\n",
    "    MATCH (a:Article {url: row.article})\n",
    "    MERGE (l:List {id: row.list})\n",
    "    MERGE (a)-[:IN_LIST]->(l)\n",
    "} IN TRANSACTIONS\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13da3fa9",
   "metadata": {},
   "source": [
    "Download and unzip the embeddings file in this folder.\n",
    "The embeddings can be downloaded using the following link:\n",
    "    https://drive.google.com/file/d/1nrKTJwEBf0OhRcmDr0cXq2qMw3k_jbtM/view?usp=share_link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f49cada4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import openAI embeddings\n",
    "embeddings = pd.read_csv('medium_embedding.csv')\n",
    "params = embeddings.values\n",
    "for i in range(0, len(params), 1000):\n",
    "    batch = [{'url':x[0], 'embeddings':x[1]} for x in params[i:i+1000]]\n",
    "    gds.run_cypher(\"\"\"\n",
    "      UNWIND $data AS row \n",
    "      MATCH (a:Article {url:row.url}) \n",
    "      SET a.openaiEmbedding = apoc.convert.fromJsonList(row.embeddings)\"\"\", \n",
    "                   {'data': batch})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b744b781",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
